import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook+pdf'
In this notebook we are going to be importing the merged_data and performing EDA analysis
df = pd.read_csv("QVI_cleaned_data.csv")
df['Date'] = pd.to_datetime(df['Date'])
df.head(2)
| Date | Store_no | Loyalty_card_no | Tax_id | Prod_no | Prod_fullname | Prod_qty | Total_sales | Brand | Lifestage | Affluence | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-10-19 | 1 | 1000 | 1 | 5 | Natural Chip Compny SeaSalt175g | 2 | 6.0 | Natural Chip Company | YOUNG SINGLES/COUPLES | Premium |
| 1 | 2019-05-16 | 1 | 1307 | 348 | 66 | CCs Nacho Cheese 175g | 3 | 6.3 | CCs | MIDAGE SINGLES/COUPLES | Budget |
t_sales = round(sum(df['Total_sales']),0)
t_quantity = sum(df['Prod_qty'])
t_customers = df['Loyalty_card_no'].agg('count')
avg_order_value = round(t_sales/t_customers,2)
no_of_brands = df['Brand'].nunique()
no_of_stores = df['Store_no'].nunique()
no_of_products = df['Prod_no'].nunique()
Avg_sales_per_store = t_sales/no_of_stores
Avg_qty_per_store = t_quantity/no_of_stores
print('Total Sales : ', t_sales)
print('-'*50)
print('Total Quantity Sold : ', t_quantity)
print('-'*50)
print('Unique Customers : ' ,t_customers)
print('-'*50)
print('Avg order value : ', avg_order_value)
print('-'*50)
print('No. of Brands : ', no_of_brands)
print('-'*50)
print('No. of Stores : ', no_of_stores)
print('-'*50)
print('No. of Products : ', no_of_products)
print('-'*50)
print('Sales per Store : ', Avg_sales_per_store)
print('-'*50)
print('Quantity Sold per Store : ', Avg_qty_per_store)
print('-'*50)
Total Sales : 1934415.0 -------------------------------------------------- Total Quantity Sold : 505124 -------------------------------------------------- Unique Customers : 264836 -------------------------------------------------- Avg order value : 7.3 -------------------------------------------------- No. of Brands : 21 -------------------------------------------------- No. of Stores : 272 -------------------------------------------------- No. of Products : 114 -------------------------------------------------- Sales per Store : 7111.819852941177 -------------------------------------------------- Quantity Sold per Store : 1857.0735294117646 --------------------------------------------------
There are 2 attributes of customer segments given as per the data set .
cg1 = df[['Loyalty_card_no','Lifestage','Affluence']].groupby(['Affluence','Lifestage'])
cdf1 = cg1.agg('count').sort_values(by =['Affluence','Lifestage']).reset_index()
cdf1.rename(columns={'Loyalty_card_no': 'No. of Customers'}, inplace=True)
fig1 = px.pie(cdf1, names='Affluence', values='No. of Customers',
title='Number of Customers by Affluence',
color='Affluence',
color_discrete_sequence=px.colors.qualitative.Antique )
affluence_labels = cdf1['Affluence'].tolist()
fig1.update_traces(text=affluence_labels, textinfo='percent+label')
fig2 = px.pie(cdf1, names='Lifestage', values='No. of Customers',
title='Number of Customers by Lifestage',
color='Lifestage', color_discrete_sequence=px.colors.qualitative.Set2)
life_labels = cdf1['Lifestage'].tolist()
fig2.update_traces(text=life_labels, textinfo='percent+label',textfont_size = 6)
fig1.show()
fig2.show()
From the above charts we can see that :
But when it comes down to Lifestages :
px.bar(cdf1 , x = 'Lifestage', y = 'No. of Customers', color = 'Affluence', title = 'No.of Customers by affluence',
color_discrete_sequence =px.colors.qualitative.Antique )
From this breakup we can state that :
df2 = df.copy()
df2['Year'] = df2['Date'].dt.year
df2['Quarter'] = df2['Date'].dt.quarter
df2['Year_month'] = df2['Date'].dt.strftime('%Y-%m')
sg1 = df2[['Year_month','Total_sales']].groupby('Year_month')
sdf1 = sg1.agg('sum').sort_values(by = 'Year_month').reset_index()
index_to_remove = 12
sdf1 = sdf1.drop(index_to_remove)
sns.lineplot(sdf1, x = 'Year_month', y = 'Total_sales', err_style = None)
plt.xticks(rotation = 45)
plt.show()
sg2 = df2[['Lifestage','Affluence','Total_sales']].groupby(['Affluence','Lifestage'])
sdf2 = sg2.agg('sum').sort_values(by = ['Affluence','Lifestage']).reset_index()
fig = px.sunburst(sdf2, path=['Affluence', 'Lifestage'], values='Total_sales',
title = 'Total sales by customer segments')
fig.show()
From this chart we can see that , there isnt much difference between sales by affluence .
sg3 = df2[['Store_no','Total_sales']].groupby(['Store_no'])
sdf3 = sg3.agg('sum').sort_values(by = 'Total_sales', ascending = False).reset_index()
sdf3['Store_no'] = sdf3['Store_no'].astype(str)
top_10 = sdf3.head(10).sort_values(by = 'Total_sales',ascending = True)
px.bar(top_10, x = 'Total_sales', y = 'Store_no', title = 'Top 10 stores')
Top 10 stores are 226, 88,165,40,58,199,4,203,26
Steps can be takes to improve the sales in all these stores as they are potential markets.
Store no. 226 alone has generated a sale of 18.905K .
Difference between top 1 store and top 10 store is not much
All the stores in top 10 have performed consistently
sdf4 = sg3.agg('sum').sort_values(by = 'Total_sales', ascending = False).reset_index()
sdf4['Store_no'] = sdf4['Store_no'].astype(str)
bottom_10 = sdf4.tail(10)
px.bar(bottom_10, x = 'Total_sales', y = 'Store_no', title = 'Bottom 10 stores')
The least selling store is 211 followed by 76,11,256,and 206
The diffference bettween last and 10 before last is huge .
Store no . 206 only sold for 5.2 dollars.
Steps can be taken down to either improve or shut down the stores all together.
bg1 = df2[['Brand','Total_sales','Prod_qty']].groupby('Brand')
bdf1 = bg1.agg('sum').sort_values(by = 'Total_sales').reset_index()
px.bar(bdf1, x = ['Total_sales','Prod_qty'], y = 'Brand', barmode = 'group')
bdf1.tail(10).sort_values(by = 'Total_sales' ,ascending = False)
| Brand | Total_sales | Prod_qty | |
|---|---|---|---|
| 20 | Kettle | 390239.8 | 79051 |
| 19 | Doritos | 241890.9 | 54216 |
| 18 | Smiths | 224660.2 | 60339 |
| 17 | Pringles | 177655.5 | 48019 |
| 16 | Infuzions | 99047.6 | 27119 |
| 15 | Red Rock Deli | 95046.0 | 33646 |
| 14 | Old El Paso | 90785.1 | 17805 |
| 13 | Thins | 88852.5 | 26929 |
| 12 | Twisties | 81522.1 | 18118 |
| 11 | Tostitos | 79789.6 | 18134 |
The total product quantity sold was 79K.
The top 5 most selling brands are:
bg2 = df2.groupby(['Affluence', 'Brand'])['Total_sales'].sum()
result = bg2.groupby(level=0, group_keys=False).nlargest(5)
# Convert the result to DataFrame for easier manipulation
# Print the result
print(result)
Affluence Brand
Budget Kettle 134407.80
Doritos 83961.25
Smiths 80744.00
Pringles 61201.70
Red Rock Deli 34688.10
Mainstream Kettle 154477.00
Doritos 95638.90
Smiths 84726.20
Pringles 69785.70
Infuzions 38836.00
Premium Kettle 101355.00
Doritos 62290.75
Smiths 59190.00
Pringles 46668.10
Infuzions 25939.60
Name: Total_sales, dtype: float64
bg3 = df2.groupby(['Affluence', 'Brand'])['Total_sales'].sum()
# Find bottom 3 brands based on total sales
result2 = bg3.groupby(level=0, group_keys=False).nsmallest(5)
# Print the result
print(result2)
Affluence Brand
Budget Burger Rings 2527.7
Others 3042.0
Sunbites 3668.6
Cheetos 6071.6
CCs 9732.0
Mainstream Burger Rings 2392.0
Others 2829.0
Sunbites 3354.1
Cheetos 6374.1
CCs 9312.9
Premium Burger Rings 1911.3
Others 2058.0
Sunbites 2653.7
Cheetos 4438.8
CCs 7080.0
Name: Total_sales, dtype: float64
Customer of every affluence category tend to prefer Brands like Kettle Doritos Smiths etc.
Burger Rings ,Subites and Cheetos have performed lowest in all categories
bg4 = df2.groupby(['Lifestage','Brand'])['Total_sales'].sum()
res3 = bg4.groupby(level=0,group_keys=False).nlargest(3)
print(res3)
Lifestage Brand
MIDAGE SINGLES/COUPLES Kettle 38358.20
Doritos 22725.60
Smiths 21148.70
NEW FAMILIES Kettle 10846.20
Doritos 6693.55
Smiths 5293.40
OLDER FAMILIES Kettle 65984.00
Smiths 45097.50
Doritos 43122.90
OLDER SINGLES/COUPLES Kettle 83862.60
Doritos 50985.45
Smiths 45154.00
RETIREES Kettle 76914.80
Doritos 46772.40
Smiths 40147.40
YOUNG FAMILIES Kettle 60033.00
Smiths 39639.70
Doritos 37931.70
YOUNG SINGLES/COUPLES Kettle 54241.00
Doritos 33659.30
Smiths 28179.50
Name: Total_sales, dtype: float64
bg5 = df2.groupby(['Store_no','Brand'])['Total_sales'].sum()
res4 = bg5.groupby(level=0,group_keys=False).nlargest(1).reset_index()
q1 = res4[['Brand', 'Store_no']].groupby('Brand').agg('count').sort_values(by = 'Store_no', ascending = False).reset_index()
px.bar(q1, x = 'Brand', y = 'Store_no', labels = {'Store_no': 'No.of Stores'} ,
title = 'No of stores where brands has topped in sales')
From the above chart alone we can see that kettle has topped sales in 208 stores.
Difference between Kettle and Smiths are huge.
Kettle is the most popular brand accross 76% of the stores , all the affluences and lifestages.
Promotion of brands like Kettle, Smiths and Doritos would increase the sales in bottom performing stores
Our main target market is Older families and Retires .
Steps can be taken to improve our markt among younger people
more toddler friendly products can be introduced to gain more new family customers
here are bottom performing stores ['117', '31', '85', '193', '92', '206', '252', '11', '76', '211']
All the above steps can be taken to improve sales in the above mentioned stores